Data Analysis

In [257]:
#导入模块
import pandas as pd
import numpy as np

#使用算法:逻辑回归,梯度下降分类,线性回归,k近邻分类,朴素贝叶斯
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

#创建特征列表表头
#column_names = ['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']
column_names = ['id','diagnosis','radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean','compactness_mean','concavity_mean','concave points_mean','symmetry_mean','fractal_dimension_mean','radius_se','texture_se','perimeter_se','area_se','smoothness_se','compactness_se','concavity_se','concave points_se','symmetry_se','fractal_dimension_se','radius_worst','texture_worst','perimeter_worst','area_worst','smoothness_worst','compactness_worst','concavity_worst','concave points_worst','symmetry_worst','fractal_dimension_worst']
#使用pandas.read_csv函数从网上读取数据集
#data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',names=column_names)
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',names=column_names)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-null float64
concave points_se          569 non-null float64
symmetry_se                569 non-null float64
fractal_dimension_se       569 non-null float64
radius_worst               569 non-null float64
texture_worst              569 non-null float64
perimeter_worst            569 non-null float64
area_worst                 569 non-null float64
smoothness_worst           569 non-null float64
compactness_worst          569 non-null float64
concavity_worst            569 non-null float64
concave points_worst       569 non-null float64
symmetry_worst             569 non-null float64
fractal_dimension_worst    569 non-null float64
dtypes: float64(30), int64(1), object(1)
memory usage: 142.3+ KB

属性信息:

  • 1)样本ID编号
  • 2)诊断(M =恶性,B =良性)

  • 3-32为每个细胞核计算十个实值特征:

    • a)半径(从中心到周边点的距离的平均值)
    • b)纹理(灰度值的标准偏差)
    • c)周界
    • d)区域
    • e)光滑度(半径长度的局部变化)
    • f)紧凑性(周长^2 / 面积 - 1.0)
    • g)凹度(轮廓凹部的严重程度)
    • h)凹点(轮廓的凹入部分的数量)
    • i)对称
    • j)分形维数(“海岸线近似” - 1)
  • 对每个数据分别求平均值,标准误差,“最差”或最大,产生30个特征。
  • 所有特征值均以四位有效数字记录。
In [253]:
# Preview the first ten rows of the raw data frame.
df.head(10)
Out[253]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678
5 843786 M 12.45 15.70 82.57 477.1 0.12780 0.17000 0.15780 0.08089 ... 15.47 23.75 103.40 741.6 0.1791 0.5249 0.5355 0.1741 0.3985 0.12440
6 844359 M 18.25 19.98 119.60 1040.0 0.09463 0.10900 0.11270 0.07400 ... 22.88 27.66 153.20 1606.0 0.1442 0.2576 0.3784 0.1932 0.3063 0.08368
7 84458202 M 13.71 20.83 90.20 577.9 0.11890 0.16450 0.09366 0.05985 ... 17.06 28.14 110.60 897.0 0.1654 0.3682 0.2678 0.1556 0.3196 0.11510
8 844981 M 13.00 21.82 87.50 519.8 0.12730 0.19320 0.18590 0.09353 ... 15.49 30.73 106.20 739.3 0.1703 0.5401 0.5390 0.2060 0.4378 0.10720
9 84501001 M 12.46 24.04 83.97 475.9 0.11860 0.23960 0.22730 0.08543 ... 15.09 40.68 97.65 711.4 0.1853 1.0580 1.1050 0.2210 0.4366 0.20750

10 rows × 32 columns

In [247]:
# Class balance: counts of malignant (M) vs benign (B) diagnoses.
# Use the keyword form — passing the Series positionally is deprecated in seaborn.
sn.countplot(x='diagnosis', data=df, label='count')
Out[247]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ea28265da0>
In [258]:
# 'id' carries no predictive information, so it is dropped.
# 'diagnosis' is the classification target (M = malignant, B = benign);
# keep it aside in `dia` and remove it from the feature frame.
dia = df['diagnosis']
# Reassign instead of inplace=True: avoids hidden in-place mutation of a
# frame that earlier cells have already displayed.
df = df.drop(['id', 'diagnosis'], axis=1)
In [262]:
# Work on an explicit copy so later transformations of `data` cannot
# silently mutate `df` (a bare `data = df` would alias the same object).
data = df.copy()
data.describe()
Out[262]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 0.062798 ... 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 0.007060 ... 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 0.049960 ... 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 0.057700 ... 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 0.061540 ... 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 0.066120 ... 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 0.097440 ... 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500

8 rows × 30 columns

Data Visualization

In [203]:
# Build a scatter matrix of all 30 features.
# The scatter matrix shows pairwise relationships between features,
# revealing which features are correlated with which others.
pd.plotting.scatter_matrix(data, alpha = 0.3, figsize = (80,60), diagonal = 'kde');
In [265]:
# Heatmap of the pairwise feature correlation matrix.
corr = data.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sn.heatmap(corr, annot=True, fmt='.2f', linewidths=.5, cmap='coolwarm', ax=ax)
Out[265]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ea28300518>

correlation features

  • Compactness_mean, concavity_mean, concave points_mean
  • radius_se, perimeter_se, area_se
  • radius_worst, perimeter_worst, area_worst
  • Compactness_worst, concavity_worst, concave points_worst
  • Compactness_se, concavity_se, concave points_se
  • texture_mean, texture_worst
  • area_worst, area_mean

Classification

In [263]:
# train_test_split lives in sklearn.model_selection
# (sklearn.cross_validation was deprecated in 0.18 and removed in 0.20).
from sklearn.model_selection import train_test_split

# Hold out a random 25% of the samples for testing; train on the remaining 75%.
X_train, X_test, y_train, y_test = train_test_split(
    data, dia, test_size=0.25, random_state=33)

# Class distribution of the training labels
y_train.value_counts()
Out[263]:
B    268
M    158
Name: diagnosis, dtype: int64
In [239]:
# Sanity check: 75% of 569 samples -> (426, 30) feature matrix.
X_train.shape
Out[239]:
(426, 30)
In [240]:
# Matching label vector for the training split.
y_train.shape
Out[240]:
(426,)
In [264]:
# Logistic-regression baseline: fit on the training split and report
# per-class precision / recall / f1 on the held-out test split.
LR = LogisticRegression()
LR.fit(X_train, y_train)
LR_pred = LR.predict(X_test)
print(metrics.classification_report(y_test, LR_pred, digits=5))
             precision    recall  f1-score   support

          B    0.94624   0.98876   0.96703        89
          M    0.98000   0.90741   0.94231        54

avg / total    0.95899   0.95804   0.95770       143

In [223]:
# Min-max-centered normalization.
# The test set must be scaled with the TRAINING statistics: scaling it
# with its own mean/range (as before) leaks test information and makes
# train/test features inconsistent at prediction time.
train_mean = X_train.mean()
train_range = X_train.max() - X_train.min()
x_train_N = (X_train - train_mean) / train_range
x_test_N = (X_test - train_mean) / train_range

from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_train_N)

# Scree plot: explained variance ratio of each principal component.
plt.figure(1, figsize=(14, 13))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_ratio_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_ratio_')
Out[223]:
Text(0,0.5,'explained_variance_ratio_')

Other

In [180]:
#标准化操作,分离均值,标准差,极值
#data_std = (data - data.mean()) / (data.std())
data_mean = data_std.iloc[:,0:10]
data_se = data_std.iloc[:,10:20]
data_worst = data_std.iloc[:,20:30]
In [181]:
# Sanity check: 10 "_mean" columns for all 569 samples.
data_mean.shape
Out[181]:
(569, 10)
In [182]:
# 均值的标准化分布
data_mean_1 = pd.concat([data_mean,dia],axis=1)
data_mean_2 = pd.melt(data_mean_1,id_vars='diagnosis',var_name='features',value_name='value')
plt.figure(figsize=(20,10))
sn.violinplot(x='features', y='value', hue='diagnosis', data=data_mean_2,split=True,inner='quart')
Out[182]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ea0e4494a8>
In [183]:
# 标准差的分布
data_se_1 = pd.concat([data_se,dia],axis=1)
data_se_2 = pd.melt(data_se_1,id_vars='diagnosis',var_name='features',value_name='value')
plt.figure(figsize=(20,10))
sn.violinplot(x='features', y='value', hue='diagnosis', data=data_se_2,split=True,inner='quart')
Out[183]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ea0d4710f0>
In [184]:
# 极值分布
data_worst_1 = pd.concat([data_worst,dia],axis=1)
data_worst_2 = pd.melt(data_worst_1,id_vars='diagnosis',var_name='features',value_name='value')
plt.figure(figsize=(20,10))
sn.violinplot(x='features', y='value', hue='diagnosis', data=data_worst_2,split=True,inner='quart')
Out[184]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ea083e2eb8>